In [1]:
# import relevant modules
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from datetime import datetime, timedelta
import ast
import glob
from tqdm.notebook import tqdm
import plotly.express as px
In [2]:
# load dfs
# topic table: only its 'hashtag' column is needed, as a plain list
topics_df = pd.read_csv('../../data/BTW17_Twitter/lda/topics.csv').drop(columns='Unnamed: 0')
hashtags = topics_df['hashtag'].tolist()

# suggestion/topic similarity table
similarity_df = pd.read_csv('../../data/BTW17_Suggestions/suggestions/topic_similarity.csv').drop(columns='Unnamed: 0')
# the scores are stored as the text of a Python literal -> parse them back into objects
similarity_df['similarity_scores'] = similarity_df['similarity_scores'].map(str).map(ast.literal_eval)
# attach the full hashtag list to every row
similarity_df['hashtags'] = [hashtags] * len(similarity_df)
# suggestions are stored as the text of a token list -> join the tokens with single spaces
similarity_df['suggestion'] = [' '.join(ast.literal_eval(raw)) for raw in similarity_df['suggestion']]

# raw suggestions: reduce the timestamp to a calendar date and count rows per (date, queryterm, suggestion)
temp_df = pd.read_parquet('../../data/BTW17_Suggestions/processed/suggestions.parquet')
temp_df['date'] = pd.to_datetime(temp_df['date']).dt.date
grouped_counts = temp_df.groupby(['date', 'queryterm', 'suggestion'], as_index=False).count()
suggestions_df = pd.DataFrame()
suggestions_df[['date', 'queryterm', 'suggestion', 'count']] = grouped_counts
In [3]:
# join cluster and group again
# attach the cluster of every suggestion; suggestions without a cluster are dropped (inner join)
suggestions_df = suggestions_df.merge(similarity_df, how='inner', on='suggestion')
# Total count per (date, queryterm, cluster). The column is selected explicitly:
# the first positional argument of GroupBy.sum is `numeric_only`, not a column name,
# so `.sum('count')` only worked because a non-empty string is truthy.
suggestions_df = suggestions_df.groupby(['date', 'queryterm', 'cluster'], as_index=False)['count'].sum()
In [4]:
# explode and group and filter cluster similarity to hashtags
MIN_SIMILARITY = 0.5  # keep only cluster/hashtag pairs whose mean score is at least this value

# one row per (suggestion, cluster, hashtag): the non-index columns are exploded in parallel
similarity_df = similarity_df.set_index(['suggestion', 'cluster']).apply(pd.Series.explode).reset_index()
similarity_df['similarity_scores'] = pd.to_numeric(similarity_df['similarity_scores'])

# Mean score per (cluster, hashtag). The column is selected explicitly:
# the first positional argument of GroupBy.mean is `numeric_only`, not a column name,
# so `.mean('similarity_scores')` only worked because a non-empty string is truthy.
similarity_df = similarity_df.groupby(['cluster', 'hashtags'], as_index=False)['similarity_scores'].mean()
similarity_df = similarity_df[similarity_df['similarity_scores'] >= MIN_SIMILARITY]
In [5]:
# set to *.csv to process all
path_to_csv = '../../data/BTW17_Twitter/hashtags/*.csv'
file_list = glob.glob(path_to_csv)

# Read every matching file and drop the leftover CSV index column.
# The frames are collected in a list and concatenated once: DataFrame.append was
# removed in pandas 2.0 and re-copied the accumulated frame on every iteration.
frames = [
    pd.read_csv(file).drop(columns='Unnamed: 0')
    for file in tqdm(file_list)
]

# Row labels of the individual files are kept (no ignore_index), exactly as append did.
# pd.concat raises on an empty list, so fall back to an empty frame when no file matched.
hashtag_df = pd.concat(frames) if frames else pd.DataFrame()
In [6]:
# preview the first rows of the hashtag counts read from the CSV files
hashtag_df.head(3)
Out[6]:
date hashtag count
0 2017-08-02 abgas 1
1 2017-08-02 abschiebung 1
2 2017-08-02 abschiebungen 2
In [7]:
# preview the first rows of the suggestion counts grouped by date, queryterm and cluster
suggestions_df.head(3)
Out[7]:
date queryterm cluster count
0 2017-05-29 achim post 8 14
1 2017-05-29 achim post 18 20
2 2017-05-29 achim post 20 16
In [8]:
# preview the cluster/hashtag pairs that passed the similarity filter
similarity_df.head(3)
Out[8]:
cluster hashtags similarity_scores
7 0 ard 0.533413
14 0 bonn 0.508825
15 0 brandenburg 0.531397
In [9]:
# draw a reproducible (fixed seed) sample of 25 cluster/hashtag pairs;
# reset_index keeps the previous row labels in an 'index' column
sampled_pairs = similarity_df.sample(n=25, random_state=1)
sample25 = sampled_pairs.reset_index()
sample25.head(3)
Out[9]:
index cluster hashtags similarity_scores
0 2566 15 steineke 0.520255
1 4317 26 münchen 0.523902
2 5741 35 grüne 0.500015
In [10]:
# For every sampled (cluster, hashtag) pair, plot the share of suggestion counts and
# the share of hashtag counts per day on one line chart.
count_columns = ['count_hashtag', 'count_suggestion']

for cluster, hashtag, score in tqdm(
        zip(sample25['cluster'], sample25['hashtags'], sample25['similarity_scores']),
        total=len(sample25)):
    suggestion_rows = suggestions_df[suggestions_df['cluster'] == cluster]
    hashtag_rows = hashtag_df[hashtag_df['hashtag'] == hashtag]

    # Aggregate each side per day BEFORE joining, keyed by the date's string form.
    # The two 'date' columns hold different types (datetime.date vs. str read from CSV),
    # so a direct merge on 'date' never matches; if the types ever did match, a direct
    # merge would repeat the hashtag count for every suggestion row of that day.
    daily_suggestions = suggestion_rows.groupby(suggestion_rows['date'].map(str))['count'].sum()
    daily_hashtags = hashtag_rows.groupby(hashtag_rows['date'].map(str))['count'].sum()

    # outer join on the date index; days missing on one side count as 0
    daily_counts = (
        pd.concat([daily_hashtags.rename('count_hashtag'),
                   daily_suggestions.rename('count_suggestion')], axis=1)
        .fillna(0)
        .rename_axis('date')
        .sort_index()
        .reset_index()
    )

    # turn absolute counts into shares of the series total so both lines are comparable;
    # a series without any counts stays at 0 instead of becoming NaN (0 / 0)
    for column in count_columns:
        total = daily_counts[column].sum()
        if total > 0:
            daily_counts[column] = daily_counts[column] / total

    fig = px.line(daily_counts, x='date', y=count_columns,
                  title=f'hashtag: {hashtag}, cluster: {cluster}, score: {score}',
                  template='simple_white', color_discrete_sequence=px.colors.qualitative.Antique)

    fig.show()